In [1]:
from IPython.display import HTML

HTML('''<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''')
Out[1]:

StackOverflow is a platform owned by the StackExchange Network where students, professionals, and enthusiasts exchange information regarding programming through posts, queries, and comments. It also serves as a platform for programmers and businesses to collaborate and work on projects. 


With roughly 18 million posted questions, it is a comprehensive network of programmers across the field. Given this wide network of programmers, StackOverflow has been the ‘go to’ website for programmers to clarify and further understand programming algorithms and its syntax

Methodology

The general framework of the study is as follows:

Data gathering

  • Data source
  • Web scraping
  • Random sampling
  • Sampling validation

Two-step clustering

  1. Numerical clustering
    • Exploratory data analysis
    • Data processing
    • Dimensionality reduction
    • Clustering
  2. Non-numerical clustering
    • Vectorization
    • Dimensionality reduction
    • Clustering
  3. Results
In [67]:
#preliminaries

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from wordcloud import WordCloud

import requests
import time
from bs4 import BeautifulSoup
import sqlite3
import seaborn as sns
import re
import os
from functools import reduce

import psycopg2
from xml.etree import ElementTree

import random
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from IPython.display import clear_output

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

from scipy.spatial.distance import euclidean, cityblock
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score, 
                             calinski_harabaz_score, silhouette_score)
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

import sqlite3
from matplotlib.colors import LinearSegmentedColormap

%matplotlib inline

The dataset used for this analysis was collected from the stackexchange repository: https://archive.org/details/stackexchange. The xml files were extracted and stored into an SQL database.

In [21]:
def FiletoDBLoad(filename, tablename, conn):
    
    if tablename == 'Posts':
        !echo "<?xml version=\"1.0\" encoding=\"utf-8\"?>" > .temp.txt
        !echo "<posts>" >> .temp.txt
        !cat {filename} |grep "PostTypeId=\"1\"" >> .temp.txt
        !echo "</posts>" >> .temp.txt
        filename = '.temp.txt'

            
    with open(filename, 'r', encoding='utf-8') as f:
        header = f.read().split('\n')[2]
    header = re.findall(r'(\S*?)=',header)
    
    tree = ElementTree.parse(filename)
    xml_data = tree.getroot()
    table = []
    for elem in xml_data:
        row = {}
        for j in header:
            try:
                col = str(f'{header.index(j):02d}')+'_'+j
                row[col] = elem.attrib[j]
            except:
                continue
        table.append(row)
    df_table = pd.DataFrame(table)
    df_table.columns = [i[3:] for i in df_table.columns]
    df_table.to_sql(tablename, con=conn, if_exists='append', index=False)
In [22]:
conn = sqlite3.connect('/mnt/processed/private/msds2020/lt13/database/stackoverflow.db')
cur = conn.cursor()

for f in files:
    FiletoDBLoad(filename=f,tablename='Posts',conn=conn)

For the purposes of this project, the top StackOverflow questions were scraped with the same information as the XML files.

In [19]:
# Proxy servers for web scraping

os.environ['HTTP_PROXY'] = 'http://165.22.32.8:80'
os.environ['HTTPS_PROXY'] = 'https://66.70.167.122:3128'

req_header = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
accept-encoding: gzip, deflate, br
accept-language: en-US,en;q=0.9
cache-control: max-age=0
cookie: _ga=GA1.2.1223690127.1554361835; __qca=P0-1087024619-1554361834541; __gads=ID=95575f8f21b13b6a:T=1554361834:S=ALNI_MYzrAP4MO9xO3-RuodMxFKsUvXijA; notice-ctt=4%3B1554364020909; prov=5a1477ef-3607-2d87-c83b-4dadc0666ec0; _gid=GA1.2.229546230.1560849365; acct=t=sHvGyo1FYK9EJrJ2RTE92i%2bijUmMLbgd&s=ODXb%2b%2f3uMcMKfe3z1cbrgAlLUsX%2bAdVM; _gat=1
referer: https://stackoverflow.com/
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'''

headers = {i.lower(): j for i,j in re.findall(r'(.*?): (.*?)$', req_header, re.M)} #re.M --> multiline


url = 'https://stackoverflow.com/?tab=month'
resp = requests.get(url, headers = headers)

resp_soup = BeautifulSoup(resp.text, 'lxml')

links = [i.get('href')
         for i in resp_soup.select('div.summary a.question-hyperlink')]
N_link = len(links)
data = []
failed_links = []
count = 0
In [ ]:
t = 2

while links:
    link = links[0]
    url_2 = 'https://stackoverflow.com/' + link
    while True:
        try:
            resp_2 = requests.get(url_2, headers=headers)
            if resp_2.status_code == 200:
                break
            else:
                print("Red status code")
                time.sleep(5)
        except Exception as e:
            print("Pause for 5 seconds before requesting again")
            time.sleep(5)
    resp_soup2 = BeautifulSoup(resp_2.text, 'lxml')

    title = resp_soup2.select_one('div#question-header > h1 > a').text
    tags = [i.text for i in resp_soup2.select(
        'div.post-taglist > div > a')]
    tags = ''.join([f'<{i}>' for i in tags])
    
    # post id
    row_ID = int(re.findall(r'/questions/(\d+)/', link)[0])
    post_type_ID = 1
    accepted_ans_ID = resp_soup2.findAll(
        'div', {'itemprop': 'acceptedAnswer'})
    if accepted_ans_ID:
        accepted_ans_ID = accepted_ans_ID[0]['data-answerid']
    else:
        accepted_ans_ID = ''

    # post statistics
    question_stats = resp_soup2.select('div.module.question-stats b')
    creation_date = question_stats[0].time['datetime']
    view_count = int(re.findall(
        r'[0-9,]+', question_stats[1].text)[0].replace(',', ''))
    last_act_date = ''
    if len(question_stats) == 3:
        last_act_date = question_stats[2].a['title'].replace(
            ' ', 'T').replace('Z', '')

    #score and body
    score = int(resp_soup2.select_one(
        'div.js-vote-count.grid--cell.fc-black-500.fs-title.grid.fd-column.ai-center').text)
    body = ''.join(list(map(str, resp_soup2.select('div.post-text p'))))

    # user id and editor id
    user_dets_main = resp_soup2.select_one(
        'div.mt16.pt4.grid.gs8.gsy.fw-wrap.jc-end.ai-start')
    user_dets = user_dets_main.select('div.user-details')
    owner_ID = re.findall(r'/users/(\d+)/', user_dets[-1].a['href'])[0]
    if len(user_dets) == 2:
        edit_date = resp_soup2.select(
            'div.question div.user-action-time')[0].span['title'].replace(' ', 'T').replace('Z', '')
        if user_dets[0].has_attr('itemprop'):
            editor_ID = owner_ID
        else:
            try:
                editor_ID = re.findall(
                    r'/users/(\d+)/', user_dets[0].a['href'])[0]
            except TypeError:
                editor_ID = ''
    else:
        edit_date = ''
        editor_ID = ''

    # answer and comment count
    ans_count = int(resp_soup2.select_one(
        'div.subheader.answers-subheader').h2['data-answercount'])
    comm = resp_soup2.select('div.question div.comments ul')
    if comm[0]['class'][0] == 'close-as-off-topic-status-list':
        comm.pop(0)
    comm_hidden = int(comm[0][
                      'data-remaining-comments-count'])
    comm_shown = len(resp_soup2.select('div.question span.comment-copy'))
    comm_count = comm_hidden + comm_shown

    data.append((row_ID, post_type_ID, accepted_ans_ID, creation_date, score,
                 view_count, body, owner_ID, editor_ID, edit_date,
                 last_act_date, title, tags, ans_count, comm_count))
    time.sleep(t)
    clear_output()
    
    count+=1
    print(f'{100 * (N_link) / 120:.2f}% done')
    links.remove(link)
In [23]:
df = pd.DataFrame(data, columns=['row Id', 'PostTypeId', 'AcceptedAnswerId',
                                 'CreationDate', 'Score', 'ViewCount', 'Body',
                                 'OwnerUserId', 'LastEditorUserId', 'LastEditDate',
                                 'LastActivityDate', 'Title', 'Tags', 'AnswerCount',
                                 'CommentCount']).astype(str)
df.head()
Out[23]:
row Id PostTypeId AcceptedAnswerId CreationDate Score ViewCount Body OwnerUserId LastEditorUserId LastEditDate LastActivityDate Title Tags AnswerCount CommentCount
0 56738380 1 56738667 2019-06-24T14:08:14 110 12753 <p>Since C++ 17 one can write an <code>if</cod... 4850111 63550 2019-06-26T17:36:50 2019-07-09T13:34:20 Most elegant way to write a one-shot 'if' <c++><if-statement><c++17> 8 14
1 56807112 1 56812838 2019-06-28T12:38:46 82 7707 <p>Today I started to receive this error with ... 5790492 1032372 2019-06-28T18:42:18 2019-07-08T14:14:43 Xcode ERROR ITMS-90783: “Missing bundle displa... <xcode><testflight><fastlane><appstoreconnect> 7 1
2 56866458 1 56866529 2019-07-03T08:58:55 59 4166 <p>I read that in C++17 we can initialize vari... 10858827 3628942 2019-07-10T10:23:55 2019-07-10T10:27:18 Initializing variables in an “if” statement <c++><c++17> 6 11
3 56692117 1 56692435 2019-06-20T18:43:16 124 6596 <p>When using the same code, simply changing t... 9419412 63550 2019-06-27T12:21:35 2019-06-27T12:21:35 Why is C++ initial allocation so much larger t... <c++><c><benchmarking> 2 11
4 56642369 1 56642520 2019-06-18T05:40:29 34 4973 <p>I updated 'android.support:appcompat-v7' to... 11212074 11212074 2019-07-10T14:09:29 2019-07-10T14:09:29 Android Material and appcompat Manifest merger... <android><react-native><react-native-android><... 13 6

Due to the size of the dataset ($18M$ questions, $1.3M$ python questions), $10000$ samples were extracted which will be the corpus of interest.

In [24]:
max_id = 56412356

stop_words = stopwords.words('english') + ['python', 'using', 'use']
for trial_no in range(20):
    print(trial_no)
    idxs = list(range(max_id//1000))
    posts = []
    while len(posts) < 1e4:
        idx = random.choice(idxs)
        cur.execute(f"""SELECT DISTINCT * FROM PostsQuestions 
                    WHERE tags LIKE "%python%"
                    AND AcceptedAnswerId IS NOT NULL
                    AND Id BETWEEN {idx*1000} AND {idx*1000+999}
                    AND Id LIKE '{idx}___'""")
        posts.extend(cur.fetchall())
        idxs.remove(idx)

    tags = list(zip(*posts))
    ids, titles = tags[0], tags[11]

    tfidf_vectorizer = TfidfVectorizer(token_pattern='\w{2,}',
                                       stop_words=stop_words,
                                       min_df=0.001,
                                       binary=True)
    bow = tfidf_vectorizer.fit_transform(titles)

    V = np.cov(bow.toarray(), rowvar=False)

    lambdas, w = np.linalg.eig(V)
    indices = np.argsort(lambdas)[::-1]
    lambdas = lambdas[indices]
    w = w[:, indices]
    var_explained = lambdas / lambdas.sum()
    for m in range(len(var_explained)):
        if var_explained[m] < 0.1*var_explained[0]:
            break
    m += 1
    cum_var_explained = var_explained.cumsum()

    n_PC = np.searchsorted(cum_var_explained, 0.8)

    X = TruncatedSVD(n_components=n_PC).fit_transform(bow)
    X_m = TruncatedSVD(n_components=m).fit_transform(bow)

    feats = tfidf_vectorizer.get_feature_names()

    files_ = {'bow':bow, 'feats':feats, 'posts':posts,
              'trunc_elbow':(m, X_m), 'trunc_0.8':(n_PC, X)}
    pickle.dump(files_, open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T{trial_no}', 'wb'))

However, due to the random nature of sampling, robustness of the sampling must be checked. In particular, the features and their weights were tested as to whether the results are similar for the different trials. This was done for 20 trials. Two validations are performed.

  1. Jaccard similarity, $\frac{A\cap B}{A\cup B}$. Using this, the total feature weights of each sampled corpus were compared with each other.
  2. Standard deviation of top features. The weight of the top features are compared across different sampled corpus.
In [27]:
T = []
for t in range(20):
    T.append(pickle.load(open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T{t}', 'rb')))
wgts_base = []
feats_base = []
for t in T:
    wgts_base.extend(list(t['bow'].toarray().sum(axis=0)))
    feats_base.extend(t['feats'])
    
top_idx = np.argsort(np.array(wgts_base))[::-1]
top_fts = list(zip(np.array(feats_base)[top_idx], np.array(wgts_base)[top_idx]))
top_fts = list(dict(top_fts).keys())[:10]
print(top_fts)
['list', 'django', 'file', 'error', 'function', 'string', 'pandas', 'get', 'data', 'values']
In [61]:
fig, ax = plt.subplots(1, 2, figsize=(16,6))
df_feat = pd.DataFrame()
for i in top_fts:
    fts_wgt = []
    for t in T:
        try:
            fts_idx = t['feats'].index(i)
            fts_wgt.append(t['bow'].toarray()[:,fts_idx].sum())
        except:
            fts_wgt.append(0)
    df_feat[i] = fts_wgt

sns.boxplot(data=df_feat, ax=ax[0])
ax[0].set_ylabel("Sum of TF-IDF weights")
ax[0].set_xlabel("Top features")

def jaccard(A, B, feats):
    """A, B dict"""
    jacc_i = 0
    for i in feats:
        if A[i] > 0:
            jacc_i += np.min([A[i], B.get(i, 0)]) / np.max([A[i], B.get(i, 0)])
        else:
            print(i)
    jacc_i /= len(feats)
    return jacc_i

jac_sim = np.zeros((20, 20))
for i in range(20):
    data_0 = pickle.load(open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T{i}', 'rb'))

    bow0 = data_0['bow']
    wgt_0 = bow0.sum(axis=0)
    feats_0 = data_0['feats']
    A = dict(zip(feats_0, np.array(wgt_0).reshape(wgt_0.shape[1])))
    for n in range(i, 20):
        data_n = pickle.load(open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T{n}', 'rb'))

        bown = data_n['bow']
        wgt_n = bown.sum(axis=0)
        feats_n = data_n['feats']
        B = dict(zip(feats_n, np.array(wgt_n).reshape(wgt_n.shape[1])))

        jac_sim[i, n] = jaccard(A, B, feats_0)
im = ax[1].imshow(jac_sim, cmap='Blues')
# ax[1].axis('off')
ax[1].set_xlabel("Sample")
ax[1].set_ylabel("Sample")
ax[1].set_xticks([i for i in range(0, 21, 2)])
ax[1].set_yticks([i for i in range(0, 21, 2)])
ax[1].set_xlim(0,19)
ax[1].set_ylim(0,19)
ax[1].set_xticklabels([i+1 for i in range(0, 20, 2)])
ax[1].set_yticklabels([i+1 for i in range(0, 20, 2)])
plt.colorbar(im, ax=ax[1])
fig.savefig('robustness.png', dpi=300)
  • From the standard deviation of top weights validation, the values are within $2\sigma$ for each feature.
  • For the Jaccard similarity, the pair-wise similarity of the samples were 0.7 at the minimum.

Both observations indicate high similarity of the sampled corpora.

Exploratory data analysis

The dataset has 5 numerical features:

  1. ViewCount - number of views a post has
  2. Score - the difference of the number of upvotes and the number of downvotes
  3. AnswerCount - the number of answer posts in the post
  4. CommentCount - the number of comments in the question post
  5. FavoriteCount - the number of favorites in the post
In [41]:
X_0 = pickle.load(open(f'/mnt/processed/private/msds2020/lt13/pickle/robustness_trials/T3', 'rb')) 

df_X0 = pd.DataFrame(X_0['posts'], columns = ['Id', 'PostTypeId', 'AcceptedAnswerId', 
                     'CreationDate', 'Score', 'ViewCount', 'Body', 'OwnerUserId', 
                     'LastEditorUserId', 'LastEditDate', 'LastActivityDate', 
                     'Title', 'Tags', 'AnswerCount', 'CommentCount', 
                     'FavoriteCount', 'CommunityOwnedDate'])
X0_num = df_X0[['Score','ViewCount','AnswerCount','CommentCount','FavoriteCount']].fillna(0).astype(int)
Pairplot of the numerical features
In [45]:
sns.pairplot(X0_num)
Out[45]:
<seaborn.axisgrid.PairGrid at 0x7f34c84992b0>
The distribution of the post scores
In [48]:
sns.distplot(X0_num['Score'].apply(lambda x: np.log(x+1-X0_num['Score'].min())))
Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f34ae5a4c50>
The distribution of the view counts
In [47]:
sns.distplot(X0_num['ViewCount'].apply(lambda x: np.log(x+1)))
Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f34ae64a470>

Looking at the scores and viewcounts, most of the data are in tens or hundreds. There are a few, however, that are orders of magnitude higher than these.

Numerical clustering: data processing

Based on the results of the EDA, log-scaling is appropriate for the given data.

In [49]:
X0_num['Score'] = X0_num['Score'].apply(lambda x: np.log(x+1-X0_num['Score'].min()))
X0_num['ViewCount'] = X0_num['ViewCount'].apply(lambda x: np.log(x+1))
In [52]:
Xnum_data = MinMaxScaler().fit_transform(X0_num.to_numpy())
Xnum_data
Out[52]:
array([[0.38346913, 0.42351782, 0.        , 0.03571429, 0.        ],
       [0.41598983, 0.53576353, 0.        , 0.21428571, 0.00959233],
       [0.37588586, 0.42557362, 0.08      , 0.17857143, 0.        ],
       ...,
       [0.47169235, 0.4256142 , 0.04      , 0.        , 0.00719424],
       [0.40392049, 0.38613342, 0.12      , 0.10714286, 0.00479616],
       [0.38346913, 0.47982782, 0.08      , 0.        , 0.        ]])

Dimensionality reduction

Now that we have our design matrix, dimensionality reduction should be used to observe the effect of the number of principal components necessary.

In [56]:
V = np.cov(Xnum_data, rowvar=False)
lambdas, w = np.linalg.eig(V)
indices = np.argsort(lambdas)[::-1]
lambdas = lambdas[indices]
w = w[:, indices]
var_explained = lambdas / lambdas.sum()
fig, ax = plt.subplots(1, 2, figsize=(16,5))
ax[0].plot(np.arange(1, len(var_explained)+1), var_explained, lw=3)
ax[0].set_xlabel('Number of features', size=14)
ax[0].set_ylabel('Variance explained', size=14);

cum_var_explained = var_explained.cumsum()
# fig, ax = plt.subplots(figsize=(12,6))
n_PC = np.searchsorted(cum_var_explained, 0.8)
ax[1].plot(np.arange(1, len(cum_var_explained)+1), 
           cum_var_explained, '-', zorder=1, lw=3)
ax[1].set_ylim(ymin=0)
#ax[1].plot([-50, n_PC], [0.8, 0.8], 'r--')
# ax[1].plot([n_PC, n_PC], [0, 0.8], 'r--')
# ax[1].scatter(n_PC, 0.8, c='r', zorder=2)
#ax[1].set_xlim(-50,)
ax[1].set_xlabel('Features', size=14)
ax[1].set_ylabel('Cumulative variance explained', size=14);

rate = (var_explained[:-1] - var_explained[1:])/var_explained[:-1]
for m in range(len(rate)):
    if rate[m] < 0.001:
        break
m += 1
# ax[0].axvline(m, c='r');
# print(m)
/opt/conda/lib/python3.6/site-packages/matplotlib/axes/_base.py:3604: MatplotlibDeprecationWarning: 
The `ymin` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `bottom` instead.
  alternative='`bottom`', obj_type='argument')

Using the standard of having $80%$ of the variance captured, the design matrix will be truncated into 2 components.

In [54]:
Xnum_trunc = TruncatedSVD(n_components=2).fit_transform(Xnum_data)

Clustering

For this analysis, K-Means is used to cluster the stackoverflow questions based on their numerical features and further verify it through internal validations

In [65]:
def intra_to_inter(X, y, dist, r):
    p_list = []
    q_list = []
    z = np.random.choice(X.shape[0], size=(r, 2))
    for i, j in z:
        if i == j:
            continue
        elif y[i] == y[j]:
            p_list.append(dist(X[i], X[j]))
        else:
            q_list.append(dist(X[i], X[j]))
    return (np.average(p_list) / np.average(q_list))

def cluster_range(X, clusterer, k_start, k_stop):
    # YOUR CODE HERE
    ys = []
    inertias = []
    chs = []
    scs = []
    iidrs=[]
    
    for k in range(k_start, k_stop+1):
        print('Executing k=', k)
        kmeans_X = KMeans(n_clusters=k, random_state=clusterer.random_state);
        y_predict_X = kmeans_X.fit_predict(X)
        ys.append(y_predict_X)
        inertias.append(kmeans_X.inertia_);
        iidrs.append(intra_to_inter(X, y_predict_X, euclidean, 50))
        chs.append(calinski_harabaz_score(X, y_predict_X));
        scs.append(silhouette_score(X,  y_predict_X));            
    
    cluster = {}
    cluster['ys'] = ys
    cluster['inertias'] = inertias
    cluster['chs'] = chs
    cluster['iidrs'] = iidrs
    cluster['scs'] = scs
    return cluster
In [ ]:
k_start, k_stop = (2, 10)
kmeans_cluster = cluster_range(Xnum_data, KMeans(random_state=1337), 
                               k_start=k_start, k_stop=k_stop)  
Internal validations
In [68]:
fig, ax = plt.subplots(1,5, figsize=(16,3))
ax[0].plot(range(k_start, k_stop+1), kmeans_cluster['inertias'], 'o--')
ax[0].set_xlabel('k')
ax[0].set_title('inertias');
ax[0].grid(which='both')
ax[1].plot(range(k_start, k_stop+1), kmeans_cluster['iidrs'], 'o--')
ax[1].set_xlabel('k')
ax[1].set_title('intra-to-inter');
ax[1].grid(which='both')
ax[2].plot(range(k_start, k_stop+1), kmeans_cluster['chs'], 'o--')
ax[2].set_xlabel('k')
ax[2].set_title('calinski-harabaz score');
ax[2].grid(which='both')
ax[3].plot(range(k_start, k_stop+1), kmeans_cluster['scs'], 'o--')
ax[3].set_xlabel('k')
ax[3].set_title('silhouette score');
ax[3].grid(which='both')
k=5
ax[0].axvline(k, c='r', ls='--');
ax[1].axvline(k, c='r', ls='--');
ax[2].axvline(k, c='r', ls='--');
ax[3].axvline(k, c='r', ls='--');
sizes = []
for i in range(k_stop-k_start+1):
    sizes.append(list(np.unique(kmeans_cluster['ys'][i], return_counts=True)[1]))
for i in range(len(sizes)):
    j_prev = 0
    colors = list(zip(*[np.linspace(0,0.8,len(sizes[i])+1) for _ in range(3)]))

    n = 0
    for j in sizes[i]:
        ax[4].bar(i+1, j, width=0.6, bottom=j_prev, color=colors[n])
        j_prev += j
        n+=1
ax[4].set_title('balance measure')

fig.savefig('internal_valid.png', dpi=300)

From the internal validations, taking into account the tradeoffs since we want low inertias, low intra-to-inter, high calinski-harabasz, and high silhouette, $k=5$ is chosen.

In [93]:
path_ = '/mnt/processed/private/msds2020/lt13/pickle/james/'
Xnum_trunc = pickle.load(open(path_+'Xnum_trunc', 'rb'))
w = pickle.load(open(path_+'Xnum_w', 'rb'))
y = pickle.load(open(path_+'Xnum_y', 'rb'))
X_tsne0 =  pickle.load(open(path_+'X3_bow_ts0', 'rb'))
y_0 = pickle.load(open(path_+'y_kmeans0', 'rb'))
X_tsne1 =  pickle.load(open(path_+'X3_bow_ts1', 'rb'))
y_1 = pickle.load(open(path_+'y_kmeans1', 'rb'))
X_tsne2 =  pickle.load(open(path_+'X3_bow_ts2', 'rb'))
y_2 = pickle.load(open(path_+'y_kmeans2', 'rb'))
X_tsne3 =  pickle.load(open(path_+'X3_bow_ts3', 'rb'))
y_3 = pickle.load(open(path_+'y_kmeans3', 'rb'))
X_tsne4 =  pickle.load(open(path_+'X3_bow_ts4', 'rb'))
y_4 = pickle.load(open(path_+'y_kmeans4', 'rb'))
In [94]:
plt.figure(figsize=(9,15), dpi=200)
plt.suptitle('Posts Measures and Themes Clustering')
gs = gridspec.GridSpec(5, 3)
ax1 = plt.subplot(gs[:-2,:])
ax1.scatter(Xnum_trunc[:,0], -Xnum_trunc[:,1], alpha=0.8, c=y);
ax1.set_axis_off(); 
for feat, feat_name in zip(w, ['Score(log)','ViewCount(log)','AnswerCount','CommentCount','FavoriteCount']):
    ax1.arrow(0.2, -0.4, .5*feat[0], .5*feat[1], color='k', width=0.003, ec='none')
    if not feat_name.endswith('(log)'):
        ax1.text(0.22+.5*feat[0], -0.38+.5*feat[1], feat_name, ha='center', color='k')
    else:
        ax1.text(0.3+.5*feat[0], -0.4+.5*feat[1], feat_name, ha='center', color='k')
ax2 = plt.subplot(gs[3,0])
ax2.scatter(X_tsne0[:,0], X_tsne0[:,1], alpha=0.8, c=y_0, cmap='Purples');
ax2.annotate(f'{len(y_0)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax2.set_title('Mod Views, High Eng')
ax2.set_axis_off(); 
ax3 = plt.subplot(gs[3,1])
ax3.scatter(X_tsne1[:,0], X_tsne1[:,1], alpha=0.8, c=y_1, cmap='Blues');
ax3.annotate(f'{len(y_1)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax3.set_title('Mod Views, Low Eng')
ax3.set_axis_off(); 
ax4 = plt.subplot(gs[3,2])
ax4.scatter(X_tsne2[:,0], X_tsne2[:,1], alpha=0.8, c=y_2, cmap='BuGn');
ax4.annotate(f'{len(y_2)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax4.set_title('Low Views, Low Eng')
ax4.set_axis_off(); 
ax5 = plt.subplot(gs[4,0])
ax5.scatter(X_tsne3[:,0], X_tsne3[:,1], alpha=0.8, c=y_3, cmap='Greens');
ax5.annotate(f'{len(y_3)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax5.set_title('Low Views, High Eng')
ax5.set_axis_off(); 
ax6 = plt.subplot(gs[4,1])
ax6.scatter(X_tsne4[:,0], X_tsne4[:,1], alpha=0.8, c=y_4, cmap='YlOrRd');
ax6.annotate(f'{len(y_4)*100/len(y):.2f}%', xy=(0,0.9), xycoords='axes fraction')
ax6.set_title('High Views, High Eng')
ax6.set_axis_off(); 
fig.savefig('fig1.png', dpi=500)

Looking at the five clusters and the features for each cluster, each cluster were classified according to their viewership and engagement levels.

For each of the five clusters from the numerical features, the titles and tags in the stackoverflow post questions were used as features to identify the prominent topics for each cluster.

TF-IDF vectorizer

Since words have no concept of distance on their own, the posts must be vectorized. For this, TF-IDF is used. The post titles and post tags were separately vectorized.

For both titles and tags, a token pattern of \w{2,} is used, classifying words as those that have atleast two continuous word-characters in them. English stopwords were used from nltk. Moreover, the word python is redundant since the questions are already filtered to python posts. using and use are filtered since they are words that are frequent but not relevant.

The minimum document frequency (the percentage of documents a word must appear in for it to be considered a feature) is 0.001 for the titles and 0.01 for the tags.

In [79]:
stop_words = stopwords.words('english') + ['python', 'using', 'use']

tfidf_vectorizer = TfidfVectorizer(token_pattern='\w{2,}',
                                   stop_words=stop_words,
                                   binary=True,
                                   min_df=0.001)
X3_bow = tfidf_vectorizer.fit_transform(df_X0['Title'][kmeans_cluster['ys'][3]==4])
tfidf_vectorizer_t = TfidfVectorizer(token_pattern='\w{2,}',
                                   stop_words=stop_words,
                                   binary=True,
                                   min_df=0.01)
X3_bow_t = tfidf_vectorizer_t.fit_transform(df_X0['Tags'][kmeans_cluster['ys'][3]==4])
In [80]:
X3_data = np.concatenate((X3_bow.toarray(), X3_bow_t.toarray()), axis=1)

Dimensionality reduction

In [81]:
V = np.cov(X3_data, rowvar=False)
lambdas, w = np.linalg.eig(V)
indices = np.argsort(lambdas)[::-1]
lambdas = lambdas[indices]
w = w[:, indices]
var_explained = lambdas / lambdas.sum()
fig, ax = plt.subplots(1, 2, figsize=(16,5))
ax[0].plot(np.arange(1, len(var_explained)+1), var_explained, lw=3)
ax[0].set_xlabel('Number of features', size=14)
ax[0].set_ylabel('Variance explained', size=14);

cum_var_explained = var_explained.cumsum()
# fig, ax = plt.subplots(figsize=(12,6))
n_PC = np.searchsorted(cum_var_explained, 0.8)
ax[1].plot(np.arange(1, len(cum_var_explained)+1), 
           cum_var_explained, '-', zorder=1, lw=3)
ax[1].set_ylim(ymin=0)
#ax[1].plot([-50, n_PC], [0.8, 0.8], 'r--')
ax[1].plot([n_PC, n_PC], [0, 0.8], 'r--')
ax[1].scatter(n_PC, 0.8, c='r', zorder=2)
#ax[1].set_xlim(-50,)
ax[1].set_xlabel('Features', size=14)
ax[1].set_ylabel('Cumulative variance explained', size=14);

rate = (var_explained[:-1] - var_explained[1:])/var_explained[:-1]
for m in range(len(rate)):
    if rate[m] < 0.001:
        break
m += 1
ax[0].axvline(m, c='r');
In [82]:
X3_bow_n = TruncatedSVD(n_components=n_PC, random_state=1337).fit_transform(X3_data)

Clustering

In [ ]:
k_start, k_stop = (2, 11)
X3_kmeans_cluster = cluster_range(X3_bow_n, KMeans(random_state=1337), 
                               k_start=k_start, k_stop=k_stop)    
Internal validations
In [85]:
fig, ax = plt.subplots(1,5, figsize=(16,3))
ax[0].plot(range(k_start, k_stop+1), X3_kmeans_cluster['inertias'], 'o--')
ax[0].set_xlabel('k')
ax[0].set_title('inertias');
ax[0].grid(which='both')
ax[1].plot(range(k_start, k_stop+1), X3_kmeans_cluster['iidrs'], 'o--')
ax[1].set_xlabel('k')
ax[1].set_title('intra-to-inter');
ax[1].grid(which='both')
ax[2].plot(range(k_start, k_stop+1), X3_kmeans_cluster['chs'], 'o--')
ax[2].set_xlabel('k')
ax[2].set_title('calinski-harabaz score');
ax[2].grid(which='both')
ax[3].plot(range(k_start, k_stop+1), X3_kmeans_cluster['scs'], 'o--')
ax[3].set_xlabel('k')
ax[3].set_title('silhouette score');
ax[3].grid(which='both')
k=6
ax[0].axvline(k, c='r', ls='--');
ax[1].axvline(k, c='r', ls='--');
ax[2].axvline(k, c='r', ls='--');
ax[3].axvline(k, c='r', ls='--');
sizes = []
for i in range(k_stop-k_start+1):
    sizes.append(list(np.unique(X3_kmeans_cluster['ys'][i], return_counts=True)[1]))
for i in range(len(sizes)):
    j_prev = 0
    colors = list(zip(*[np.linspace(0,0.8,len(sizes[i])+1) for _ in range(3)]))

    n = 0
    for j in sizes[i]:
        ax[4].bar(i+1, j, width=0.6, bottom=j_prev, color=colors[n])
        j_prev += j
        n+=1
ax[4].set_title('cluster nos');
Word clouds of the identified clusters
In [90]:
from wordcloud import WordCloud
k = 5
y_kmeans = X3_kmeans_cluster['ys'][k-k_start]
fig, ax = plt.subplots(1,5, figsize=(16,3))
for i in range(k):
    docs = X3_data[np.where(y_kmeans==i)[0]]
    dom_feat = np.argsort(docs.sum(axis=0))[::-1][:50]
    feat_wgt = docs.sum(axis=0)[dom_feat]
    dom_feat = np.array(tfidf_vectorizer.get_feature_names()
                        + tfidf_vectorizer_t.get_feature_names())[dom_feat]
    word_cld = {i[0]:int(i[1])+1 for i in zip(dom_feat, feat_wgt)}
    wordcloud = WordCloud(width = 800, height = 800, 
                          background_color ='white', 
                          collocations = False).generate_from_frequencies(word_cld)
    ax[i].imshow(wordcloud)
    ax[i].axis('off')
    

In [87]:
wordcloud0 =  pickle.load(open(path_+'wordcloud0', 'rb'))
wordcloud1 =  pickle.load(open(path_+'wordcloud1', 'rb'))
wordcloud2 =  pickle.load(open(path_+'wordcloud2', 'rb'))
wordcloud3 =  pickle.load(open(path_+'wordcloud3', 'rb'))
wordcloud4 =  pickle.load(open(path_+'wordcloud4', 'rb'))
In [88]:
fig, ax = plt.subplots(10, 6, figsize=(15,25), sharey=False, dpi=200)
plt.subplots_adjust(wspace=0, hspace=0)
for i in range(10):
    ax[i,0].set_axis_off();
    for j in range(1,6):
        ax[i,j].spines['top'].set_color('Grey')
        ax[i,j].spines['left'].set_color('Grey')
        ax[i,j].spines['bottom'].set_color('Grey')
        ax[i,j].spines['right'].set_color('Grey')
        ax[i,j].set_xticks([])
        ax[i,j].set_yticks([])
ax[0,1].set_title('$Hot$ $Posts$\nHi View, Hi Eng');
ax[0,2].set_title('$Trending$\nMod View, Hi Eng');
ax[0,3].set_title('$S.O.S$\nMod View, Low Eng');
ax[0,4].set_title('$Curious$ $Topics$\nLow View, Hi Eng');
ax[0,5].set_title('$Spam$\nLow View, Low Eng');
ax[0,1].imshow(wordcloud4[0], aspect='auto');
ax[9,1].imshow(wordcloud4[1], aspect='auto');
ax[1,1].imshow(wordcloud4[2], aspect='auto');
ax[5,1].imshow(wordcloud4[3], aspect='auto');
ax[6,1].imshow(wordcloud4[4], aspect='auto');
cmap_g = LinearSegmentedColormap.from_list('mycmap', ['#11644D', '#A0B046'])
ax[3,4].imshow(wordcloud3[0].recolor(colormap=cmap_g), aspect='auto');
ax[2,4].imshow(wordcloud3[1].recolor(colormap=cmap_g), aspect='auto');
ax[9,4].imshow(wordcloud3[2].recolor(colormap=cmap_g), aspect='auto');
ax[5,4].imshow(wordcloud3[3].recolor(colormap=cmap_g), aspect='auto');
ax[1,4].imshow(wordcloud3[4].recolor(colormap=cmap_g), aspect='auto');
ax[4,4].imshow(wordcloud3[5].recolor(colormap=cmap_g), aspect='auto');
cmap_p = LinearSegmentedColormap.from_list('mycmap', ['#DDA0DD', '#4B0082'])
cmap_b = LinearSegmentedColormap.from_list('mycmap', ['#107FC9', '#0B108C'])
cmap_bg = LinearSegmentedColormap.from_list('mycmap', ['#20B2AA', '#008080'])
ax[0,2].imshow(wordcloud0[0].recolor(colormap=cmap_p), aspect='auto');
ax[5,2].imshow(wordcloud0[1].recolor(colormap=cmap_p), aspect='auto');
ax[1,2].imshow(wordcloud0[2].recolor(colormap=cmap_p), aspect='auto');
ax[6,2].imshow(wordcloud0[3].recolor(colormap=cmap_p), aspect='auto');
ax[9,2].imshow(wordcloud0[4].recolor(colormap=cmap_p), aspect='auto');
ax[2,2].imshow(wordcloud0[5].recolor(colormap=cmap_p), aspect='auto');
ax[2,3].imshow(wordcloud1[0].recolor(colormap=cmap_b), aspect='auto');
ax[7,3].imshow(wordcloud1[1].recolor(colormap=cmap_b), aspect='auto');
ax[5,3].imshow(wordcloud1[2].recolor(colormap=cmap_b), aspect='auto');
ax[9,3].imshow(wordcloud1[3].recolor(colormap=cmap_b), aspect='auto');
ax[8,3].imshow(wordcloud1[4].recolor(colormap=cmap_b), aspect='auto');
ax[0,3].imshow(wordcloud1[5].recolor(colormap=cmap_b), aspect='auto');
ax[1,3].imshow(wordcloud1[6].recolor(colormap=cmap_b), aspect='auto');
ax[6,5].imshow(wordcloud2[0].recolor(colormap=cmap_bg), aspect='auto');
ax[0,5].imshow(wordcloud2[1].recolor(colormap=cmap_bg), aspect='auto');
ax[9,5].imshow(wordcloud2[2].recolor(colormap=cmap_bg), aspect='auto');
ax[5,5].imshow(wordcloud2[3].recolor(colormap=cmap_bg), aspect='auto');
ax[3,5].imshow(wordcloud2[4].recolor(colormap=cmap_bg), aspect='auto');
ax[0,0].annotate('$Pandas$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[1,0].annotate('$Numpy$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[2,0].annotate('$String$\n$Processing$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[3,0].annotate('$Lists$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[4,0].annotate('$Classes$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[5,0].annotate('$Django$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[6,0].annotate('$Matplotlib$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[7,0].annotate('$Web$\n$Interfaces$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[8,0].annotate('$Web$\n$Engines$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
ax[9,0].annotate('$Mix$', xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
Out[88]:
Text(0.3, 0.5, '$Mix$')

Looking at the non-numerical clusters for the different numerical clusters. There are some differences, however, for the features in the cluster for the different numerical clusters. For example for the numpy clusters, the Trending, S.O.S, and Curious Topics also have scipy along with numpy. However, for the Hot posts, the scipy feature is not visible. This can be attributed to the target of scipy which is the scientific community, a smaller proportion of the programming community.

In [ ]: